BIOS611 project

This project was created for the BIOS 611 class, Fall 2024. The data set I am using is eye-tracking data from 45 healthy adults. During data collection, participants read 6 short texts twice from a given perspective (that of a real-estate buyer or a babysitter), then switched perspectives and read the texts again. The data set consists of 39 variables and 76,517 observations. My goal is to find patterns in the data that will help us better understand the mechanisms of text processing. Specifically, I am interested in eye-tracking measures that might predict attention allocation during text rereading.

data source

# Setup: load packages and read the cleaned eye-tracking data.
# NOTE(review): setwd() inside a script is fragile -- prefer launching R from
# the project root (or here::here()) so relative paths resolve. Kept for now
# because the read.csv() path below is relative to ~/work.
setwd("~/work")
library(tidyverse)  # attaches dplyr, ggplot2, readr, tidyr, purrr, ...
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.4     v readr     2.1.5
## v forcats   1.0.0     v stringr   1.5.1
## v ggplot2   3.5.1     v tibble    3.2.1
## v lubridate 1.9.3     v tidyr     1.3.1
## v purrr     1.0.2     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# library(ggplot2) removed: ggplot2 is already attached by tidyverse above.
cleaned_data <- read.csv("./cleaned_data2.csv")

Set the factor labels

# Human-readable labels for the experimental-design factors.
SentenceRelevance_labels <- c("0 Neutral", "1 Buyer", "2 Sitter")
SubjectPerspective_labels <- c("Started with Sitter", "Started with Buyer")
CongruencyLevels_labels <- c("Non-congruent", "Neutral", "Congruent")
# Apply the congruency labels (they were defined above but never used).
# NOTE(review): assumes CongruencyLevels has exactly three levels whose sorted
# order is non-congruent < neutral < congruent -- confirm against the codebook.
cleaned_data$CongruencyLevels <- factor(cleaned_data$CongruencyLevels,
                                        labels = CongruencyLevels_labels)

Do PCA

PCA is used here to reduce the eight correlated eye-tracking measures (first-fixation time and duration, second-fixation duration, first-pass fixation count, gaze duration, regression duration, total reading time, and skipping) to a small set of orthogonal components, making the dominant patterns of variation in reading behavior easier to see.

# Select the eight numeric eye-tracking measures that enter the PCA.
numeric_cols <- dplyr::select(cleaned_data, IA_First_Fixation_Time, FFdur,
                              F2dur, FPNfix, GZD, regrdur, TTime, Skip)
numeric_cols_scaled <- scale(numeric_cols)
# Keep only rows whose scaled values are all finite (drops NA, NaN, +/-Inf).
# Computed ONCE and reused below so the PCA rows and the metadata rows stay
# aligned by construction (the original duplicated this mask expression).
keep_rows <- apply(is.finite(numeric_cols_scaled), 1, all)
numeric_cols_scaled_clean <- numeric_cols_scaled[keep_rows, ]
# PCA on the cleaned matrix. scale. = TRUE re-standardizes after row removal,
# so the analyzed columns are exactly mean 0 / sd 1.
pca_result <- prcomp(numeric_cols_scaled_clean, center = TRUE, scale. = TRUE)
summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5    PC6     PC7
## Standard deviation     1.5188 1.1686 1.0537 0.9995 0.9398 0.9136 0.67017
## Proportion of Variance 0.2883 0.1707 0.1388 0.1249 0.1104 0.1043 0.05614
## Cumulative Proportion  0.2883 0.4590 0.5978 0.7227 0.8331 0.9374 0.99355
##                            PC8
## Standard deviation     0.22710
## Proportion of Variance 0.00645
## Cumulative Proportion  1.00000
pca_result$rotation
##                                PC1        PC2         PC3         PC4
## IA_First_Fixation_Time  0.01128081 -0.1856810  0.69930259  0.27025185
## FFdur                   0.44679376  0.0570400  0.02596974  0.29994887
## F2dur                   0.25710100  0.5480737  0.16948165  0.06398679
## FPNfix                  0.37824933 -0.5422693 -0.16473380 -0.30448658
## GZD                     0.60907500 -0.2549171 -0.08242947 -0.04180739
## regrdur                 0.05041756  0.3689164 -0.05419034 -0.72068647
## TTime                   0.46553590  0.3889549  0.07089244  0.07904674
## Skip                   -0.02892479  0.1243391 -0.66308616  0.46130575
##                                 PC5         PC6         PC7           PC8
## IA_First_Fixation_Time -0.051389872  0.63291191 -0.01076196  0.0009454112
## FFdur                  -0.647853415 -0.19537914  0.27046393 -0.4188558581
## F2dur                   0.566801895 -0.00347724  0.50948824 -0.1408588771
## FPNfix                  0.343767818  0.17414914 -0.04526499 -0.5408039806
## GZD                    -0.008680241  0.02503446  0.20717001  0.7154585663
## regrdur                -0.361883241  0.44709835  0.09022667 -0.0059937050
## TTime                   0.082428880 -0.01286963 -0.78324800 -0.0166934411
## Skip                    0.019364539  0.57464790  0.02384443 -0.0072905416
# Per-observation PC scores, with the Reading condition attached for plotting.
pca_scores <- pca_result$x
cleaned_data_aligned <- cleaned_data[keep_rows, ]
# Build the plotting frame column-by-column rather than via cbind():
# cbind(matrix, vector) coerces everything to a common type and would
# silently turn the PC columns into character if Reading is non-numeric.
pca_plot_data <- as.data.frame(pca_scores)
pca_plot_data$Reading <- cleaned_data_aligned$Reading

Biplot

# biplot() is a base-graphics function that draws to the active device and
# returns NULL invisibly, so it cannot be captured and saved with ggsave()
# (which expects a ggplot object -- the original call saved nothing useful).
# Draw it straight into a png device instead.
png("biplot.png", width = 7, height = 5, units = "in", res = 300)
biplot(pca_result, scale = 0)
dev.off()

# Pairwise scatter plots of the first three principal components, colored by
# Reading (first vs. second reading). The three plots share identical
# structure, so a small helper removes the triplicated ggplot code.
#
#   df      data frame holding PC score columns plus a Reading column
#   xcol    name of the PC column for the x axis, e.g. "PC1"
#   ycol    name of the PC column for the y axis
#   outfile path passed to ggsave()
# Returns the ggplot object so callers can keep a handle to it.
save_pc_scatter <- function(df, xcol, ycol, outfile) {
  p <- ggplot(df, aes(x = .data[[xcol]], y = .data[[ycol]])) +
    geom_point(aes(color = Reading), size = 3) +
    labs(
      title = paste0("PCA: ", xcol, " vs ", ycol),
      x = sub("PC", "Principal Component ", xcol),
      y = sub("PC", "Principal Component ", ycol),
      color = "Reading"
    ) +
    theme_minimal()
  ggsave(outfile, plot = p)
  p
}
plot <- save_pc_scatter(pca_plot_data, "PC1", "PC2", "figures/PC1vsPC2.png")
## Saving 7 x 5 in image
plot2 <- save_pc_scatter(pca_plot_data, "PC1", "PC3", "figures/PC1vsPC3.png")
## Saving 7 x 5 in image
plot3 <- save_pc_scatter(pca_plot_data, "PC2", "PC3", "figures/PC2vsPC3.png")
## Saving 7 x 5 in image

K-means clustering

# Cluster the observations in the space spanned by the first two principal
# components. k = 3 clusters; the seed is fixed so results are reproducible
# and nstart = 25 restarts guard against poor local optima.
reduced_data <- as.data.frame(pca_scores[, c("PC1", "PC2")])
set.seed(123)
kmeans_result <- kmeans(reduced_data, centers = 3, nstart = 25)
# Attach each observation's cluster assignment as a factor column.
reduced_data[["Cluster"]] <- factor(kmeans_result$cluster)
# Inspect the fitted cluster centers in PC space.
kmeans_result$centers
##          PC1        PC2
## 1 -0.7840434  0.4310042
## 2  0.4285204 -1.0874878
## 3  3.0017653  0.7794242

Plot the clusters

# Visualize the k-means solution in PC1-PC2 space, one color per cluster.
plot4 <- ggplot(reduced_data, aes(PC1, PC2, color = Cluster)) +
  geom_point(size = 3) +
  labs(title = "K-means Clustering on Principal Components",
       x = "Principal Component 1",
       y = "Principal Component 2",
       color = "Cluster") +
  theme_minimal()
ggsave("figures/K-means.png", plot = plot4)
## Saving 7 x 5 in image

Conclusions

This requires further analysis. I’ll map the clusters to the original data, analyze their characteristics, and determine associations with any specific variables.